import nltk
from nltk.probability import FreqDist
from nucular import Nucular
from operator import itemgetter

# Filesystem location passed to Nucular.Nucular() by getResults() to open the
# archive that is searched. Hard-coded to one developer's machine.
SIMPLE_ARCHIVE = 'c:/users/vaidhy/Development/scipy/archive/simple'

def getResults(w):
    """Query the archive at SIMPLE_ARCHIVE for `w` and return the matches.

    A fresh Nucular session is opened on every call, a query is built with
    ``anyWord(w)``, and whatever ``resultDictionaries()`` yields is returned
    unchanged.
    """
    session = Nucular.Nucular(SIMPLE_ARCHIVE)
    query = session.Query()
    query.anyWord(w)
    return query.resultDictionaries()

def findColocations(w):
    """Print the 15 best-scoring adjacent word pairs in documents matching `w`.

    Every result returned by ``getResults(w)`` has its ``'content'`` field
    split into sentences and then into word tokens. All tokens are collected
    into one flat list, so adjacent pairs are formed over that whole list
    (including across sentence and document boundaries). Each distinct pair
    is rated with ``score()`` and the top 15 are printed, one per line, as
    ``"word1 word2"``.

    Nothing is returned; output goes to stdout.
    """
    words = []
    for doc in getResults(w):
        for sentence in nltk.sent_tokenize(doc['content']):
            words.extend(nltk.word_tokenize(sentence))

    # Frequency of each token and of each adjacent (token, next_token) pair.
    wfd = FreqDist(words)
    pfd = FreqDist(zip(words, words[1:]))

    scored = [((w1, w2), score(w1, w2, wfd, pfd)) for w1, w2 in pfd]
    scored.sort(key=itemgetter(1), reverse=True)
    for w1, w2 in map(itemgetter(0), scored[:15]):
        # Single-argument call form prints identically on Python 2 and 3;
        # the bare `print x` statement is a SyntaxError on Python 3.
        print(w1 + ' ' + w2)

def score(word1, word2, wfd, pfd, power=3):
    """Rate how strongly `word1` followed by `word2` forms a collocation.

    The value is ``pair_count ** power / (count1 * count2)``, where the
    counts are looked up in `wfd` (indexed by word) and `pfd` (indexed by
    the ``(word1, word2)`` tuple).

    Returns a float. If either word has a count of zero the result is 0.0
    rather than a ZeroDivisionError.
    """
    freq1 = wfd[word1]
    freq2 = wfd[word2]
    freq12 = pfd[(word1, word2)]

    denominator = float(freq1 * freq2)
    if denominator == 0:
        # A word that never occurs cannot be part of an observed pair.
        return 0.0
    return freq12 ** power / denominator
        
# Script entry point: when executed directly (not imported), print the
# collocations found for a fixed sample query word.
if __name__ == "__main__":   
    findColocations('parseltongue')
